/** tabdata2.do

This do-file takes the tabular data from various sources disaggregated into 
urban district, rural county and county city units and merges it together to create
the data set us123.dta.

**/

clear
set mem 100m
set more off
capture log close
log using tabdata2.log, replace text


****************** 1. Prepare Data Sets to Merge to Correspondence Table *************************

** Individual Yearbooks
use ..\..\data\tabular_data_BJ\source\ind_yb_rz.dta
drop if unit_status>3
***Delete the 4 duplicate obs: Second obs generally has numbers*100
sort unit_code_08 year gdp
by unit_code_08 year: keep if _n==1
***Fix these unit_status which are wrong
replace unit_status=5 if unit_code_08==130200 & year==1995
replace unit_status=5 if unit_code_08==130200 & year==2000
replace unit_status=5 if unit_code_08==130200 & year==2005
replace unit_status=4 if unit_code_08==130201 & year==1995
replace unit_status=4 if unit_code_08==130201 & year==2000
replace unit_status=4 if unit_code_08==130201 & year==2005
replace unit_status=5 if unit_code_08==130400 & year==1995
replace unit_status=4 if unit_code_08==130401 & year==1995
replace unit_status=5 if unit_code_08==510300 & year==2000
replace unit_status=4 if unit_code_08==510301 & year==2000
replace unit_status=1 if unit_code_08==341602 & year==2000
drop if unit_status>3
*** This obs was wrong
replace unit_code_08=341600 if unit_code_08==341602 & year==2000
***Extra obs with no information anyways
drop if unit_code_08==530101 & unit_status==1
sort unit_code_08 year
save temp_indyb.dta, replace

**Michigan Online
**Get county cities that were in the michigan data
use ..\..\data\tabular_data_BJ\source\MI_4YCP.dta
keep if city_code==441900 | city_code==442000 | city_code==341600
sort city_code year
save temp_mi3ycp.dta, replace

use ..\..\data\tabular_data_BJ\source\MI_ADCP.dta
keep if city_code==441900 | city_code==442000 | city_code==341600
keep if year==2000|year==2005|year==2008|year==2010
sort city_code year
merge using temp_mi3ycp.dta
tab _merge
drop _merge
replace unit_code_08 = 441900 if city_code==441900
replace unit_code_08 = 442000 if city_code==442000
replace unit_code_08 = 341600 if city_code==341600 & year==2000
replace unit_code_08 = 341602 if city_code==341600 & year>2000
rename total_pop pop
append using ..\..\data\tabular_data_BJ\source\MI_4YRC.dta
** These are prefectures that we decided to drop
drop if city_code==640100|city_code==640200|unit_code_08==640181
rename gdp gdp_michigan
rename avg_salary_of_staff_worker avgsalary_michigan

sort unit_code_08 year
save temp_mi.dta, replace

*** Capital Investment data
use ..\..\data\tabular_data_BJ\source\MI_InvCap00_01.dta
keep unit_code_08 inv_cap year
reshape wide inv_cap, i(unit_code_08) j(year)
gen year = 2000
sort unit_code_08 year
merge unit_code_08 year using temp_mi.dta
tab year _merge
** the one _merge=1 is OK
drop _merge
sort unit_code_08 year
save temp_mi.dta, replace

**1997 Villagers Committees
use ..\..\data\tabular_data_BJ\source\MI_vc97_final.dta
*** Drop unusable data b/c of changes 95-97
*** Each is the result of a split, so data only partially covers a common code region
drop if unit_code_08==431221
drop if unit_code_08==445323
drop if unit_code_08==450681
drop if unit_code_08==450924
drop if unit_code_08==510626
drop if unit_code_08==623001
replace year = 1995
** These are prefectures that we decided to drop
drop if city_code==640100|city_code==640200
sort unit_code_08 year
*** DROP REPEATED OBS, TO BE CLEANED UP LATER
by unit_code_08 year: keep if _n==1
sort unit_code_08 year
save temp_vc97.dta, replace

**Fiscal 1995
use ..\..\data\tabular_data_BJ\source\fiscal_95_rz.dta
*** These codes were wrong
replace unit_code_08 = 130902 if unit_code_08==130105 & city_code==130900
drop if unit_code_08 == 130302 & city_code==130200
replace unit_code_08 = 210303 if unit_code_08==210106 & city_code==210300
replace unit_code_08 = 321324 if unit_code_08==321323 & rural_pop==85
replace unit_name = "Sihong County" if unit_code_08==321324
replace gdp = . if gdp<0
replace gdp_sector2 = . if gdp_sector2<0
rename gdp gdp_f95
rename gdp_sector2 gdp_sector2_f95
rename pop pop_f95
rename rural_pop rural_pop_f95
sort unit_code_08 year
save temp_f95.dta, replace

**Fenxian 1990 (rural counties)
use ..\..\data\tabular_data_BJ\source\fenxian_1990_rz.dta
*** We can't merge this obs and it has no info on any variables
drop if county_name=="Qujing City" & unit_code_08==-9 & year==1990
*** These are repeated obs with a few numbers we won't use that are different
drop if unit_code_08==231024 & unit_status==3
drop if unit_code_08==231025 & fertilizer_1990==20213
*** Rename these variables that overlap with those from ind_yb data
rename gdp gdp_fenxian
rename gdp_sec1 gdp_sector1_fenxian
rename gdp_sec2 gdp_sector2_fenxian
rename agr_mp agr_mp_fenxian
rename ruralpop ruralpop_fenxian
rename rurallabor rurallabor_fenxian
sort unit_code_08 year
save temp_fenxian.dta, replace
***Create alternative version to merge on names
keep if unit_code_08==-9
gen mrg1=1
rename county_name unit_name
sort mrg1 unit_name year
save temp_fenxian2.dta, replace

************************************************************
************************************************************
** Professor Zhang's 1. supplement_xian_updated.csv asset data  (HY 08/09/12)

insheet using ..\..\data\tabular_data_BJ\source\asset_supplement_xian.csv, clear
tab unit_status

*** rename variables (add qz & shorted them)
rename gross_asset_cp asset_g_qz
rename net_asset_cp asset_n_qz

drop if asset_g_qz==. & asset_n_qz==.
keep year unit_code_08 *_qz

sort unit_code_08 year
save temp_qz1.dta, replace

*** This is for use in predicting net assets in 2000
insheet using ..\..\data\tabular_data_BJ\source\asset_dijishi.csv, clear
drop if net_asset_cp==net_asset_pref
drop if net_asset_pref==.
keep if year==2000
keep city05 year net_asset_pref net_asset_cp
sort city05 year
save assets00.dta, replace


************************************************************
************************************************************
** Professor Zhang's 2. county_23_6.csv asset data (HY 08/09/12)

insheet using ..\..\data\tabular_data_BJ\source\county_23_6.csv, clear
rename uc_08 unit_code_08

*** rename variables (add qz & shorted them)
rename gross_asset_cp asset_g_qz
rename net_asset_cp asset_n_qz
rename gdp_cp gdp_qz
rename gdp_s1_cp gdp_s1_qz
rename gdp_s2_cp gdp_s2_qz
rename gdp_s3_cp gdp_s3_qz
rename fdi_cp fdi_qz
rename soe_output_cp soe_output_qz
rename total_output_cp total_output_qz

/* left for future use; some gdp and fdi variables are here
rename gross_asset_pref10e4 asset_g_qz_pf
rename net_asset_pref asset_n_qz_pf
rename *_pref *_qz_pf
rename *_pref10e4 *_qz_pf
rename gross_asset_* asset_g_*
rename net_asset_* asset_n_*
rename gdp10e8_qz_pf gdp_qz_pf
rename fdi10e4usd_qz_pf fdi_qz_pf
*/

keep unit_code_08 *_qz

gen year=1990

sort unit_code_08 year
save temp_qz2.dta, replace



/*** Printed Yearbooks have some data for county cities up to 1997 (use only 90,95)
Identify obs in corr file that are potential PY obs.  Use city00 as the merge
variable because the PY data is set up that way. **/
use ..\..\data\correspondence_tables\generated\correspondence_82_10.dta
keep if year<=2005
egen urban05 = max((unit_status==1)*(year==2005)), by(unit_code_common)
egen us = max(unit_status==1), by(city_code year)
replace urban05 = 0 if us==1
#delimit ;
keep if urban05==1|unit_code_08==341602|unit_code_08==422801|unit_code_08==431102
|unit_code_08==441900|unit_code_08==442000|unit_code_08==451002|unit_code_08==451202
|unit_code_08==522201|unit_code_08==530601|unit_code_08==620602|unit_code_08==620702
|unit_code_08==620802|unit_code_08==620902|unit_code_08==321200|unit_code_08==321300
|(unit_code_08==371200&year==1990)|(unit_code_08==445100&year==1990);
#delimit cr
keep if unit_status==3 & year<2000
drop if unit_code_08==431103|unit_code_08==330110|unit_code_08==431103|unit_code_08==350681|unit_code_08==500066
sort city05 year
by city05 year: gen obs = _N
tab obs
drop if obs>1
keep city05 unit_code_08 year city_name unit_name
sort city05 year
save tempcorr.dta, replace

use ..\..\data\tabular_data_BJ\source\PY_ADCP_final_rz.dta
sort unit_code year
save ..\..\data\tabular_data_BJ\source\PY_ADCP_final_rz.dta, replace
use ..\..\data\tabular_data_BJ\source\PY_2YCP_final_rz.dta
sort unit_code year 
merge unit_code year using ..\..\data\tabular_data_BJ\source\PY_ADCP_final_rz.dta, update
tab _merge
drop _merge
keep if year==1990|year==1995
gen city05 = unit_code
replace city05 = unit_code-100 if city05-10*int(city05/10)==0
replace city05 = unit_code-1 if city05-10*int(city05/10)==1
sort city05 year
merge city05 year using tempcorr.dta
tab _merge
keep if _merge==3
drop _merge unit_code city05 agremp-pubemp
rename gdp gdp_py
rename fdi fdi_py
rename givo givo_py
sort unit_code_08 year
save temp_2ycp.dta, replace

*** Provincial GDP Data from 1990
use ..\..\data\tabular_data_BJ\source\yearbook90.dta
keep if year==1990
** This is Chongqing, part of Sichuan until 1997
drop if province_code==500000
keep province_code gdp_total gdp_sec1 gdp_sec2 gdp_sec3
rename gdp_total pgdp
rename gdp_sec1 pgdp_sec1
rename gdp_sec2 pgdp_sec2
rename gdp_sec3 pgdp_sec3
rename province_code province_code90
gen year = 1990
sort province_code90 year
save temp_p90.dta, replace


/****** 2. Merge Data Sets to Correspondence Table Using 
	Urban District, Rural County, County City Units Only ********/

use ..\..\data\correspondence_tables\generated\correspondence_82_10.dta

**** Individual Yearbook Data
sort unit_code_08 year
merge unit_code_08 year using temp_indyb.dta
sort year
by year: tab unit_status _merge
rename _merge mrg_ind
sort unit_status unit_code_08
/*** These are the individual yearbook codes for which 
there is no code in the correspondence file and for whom we do
not have a geography, even those listed for 2005.  Because the
remaining units merged perfectly in each city code affected, drop these. ****/
l unit_code_08 unit_name unit_status year if mrg_ind==2
drop if mrg_ind==2

**** Michigan Online Data (rural county and city data, 2000-2008 Only)
sort unit_code_08 year
merge unit_code_08 year using temp_mi.dta
tab unit_status _merge if year>1995
rename _merge mrg_MI3Yrc
** These missings are backed out by comparing the PF and county data
* Urban Units
replace gdp_michigan = 37.96 if unit_code_08==140602 & year==2000
replace gdp_michigan = 63.75 if unit_code_08==360313 & year==2000
replace gdp_michigan = 73.23 if unit_code_08==440660 & year==2000
replace gdp_michigan = 88.73 if unit_code_08==140602 & year==2005
replace gdp_michigan = 17.66 if unit_code_08==230624 & year==2005
replace gdp_michigan = 129.98 if unit_code_08==360313 & year==2005
*Rural Units
replace gdp_michigan = 49.31 if unit_code_08==230111 & year==2000
replace gdp_michigan = 138.95 if unit_code_08==320506 & year==2000
replace gdp_michigan = 37.37 if unit_code_08==321311 & year==2000
replace gdp_michigan = 97.6 if unit_code_08==350304.5 & year==2000
replace gdp_michigan = 24.76 if unit_code_08==640502 & year==2000
*** THese are two adjacent special districts which I split residual GDP evenly across
replace gdp_michigan = 13.31 if (unit_code_08==130904|unit_code_08==130905)& year==2005
replace gdp_michigan = 13.82 if unit_code_08==410188 & year==2005
** Special rural district
replace gdp_michigan = 23.43 if unit_code_08==520203 & year==2005
** Assign remaining special districts GDP of 0
replace gdp_michigan = 0 if unit_status==-1 & year>=2000 & gdp_michigan==.

sort year unit_status unit_code_08
*** These are the rural counties and county cities that should be in the MI data but are not
l unit_code_08 city_name city_code unit_name unit_status year if mrg_MI3Yrc==1 & year>1995 & unit_status>1 & gdp_michigan==.

**** VC97 Data
sort unit_code_08 year
merge unit_code_08 year using temp_vc97.dta
tab unit_status _merge if year==1995
rename _merge mrg_VC97

**** FY95 Data
sort unit_code_08 year
merge unit_code_08 year using temp_f95.dta, update
tab _merge if year==1995
** There are no names for these obs so they can't be merged
drop if _merge==2
rename _merge mrg_f95

**** PY Data
sort unit_code_08 year
merge unit_code_08 year using temp_2ycp.dta
tab year _merge if year<2000
rename _merge mrg_2ycp

**** Fenxian 1990 Data (rural county and county city data, 1990 Only)
sort unit_code_08 year
merge unit_code_08 year using temp_fenxian.dta
tab _merge unit_status if year==1990 & unit_status>1
rename _merge mrg_fenxian
tab unit_code_08 if mrg_fenxian==2
drop if mrg_fenxian==2
***** For the bad merges, we can merge on unit name instead
gen mrg1 = (mrg_fenxian==1 & year==1990 & unit_status>1)
sort mrg1 unit_name year
by mrg1 unit_name year: gen obs = _N
replace mrg1 = 0 if obs>2
sort mrg1 unit_name year
merge mrg1 unit_name year using temp_fenxian2.dta, update
tab _merge unit_status if year==1990 & unit_status>1 & mrg1==1
replace mrg_fenxian=3 if _merge==5
drop if _merge==2
replace mrg_fenxian = 3 if _merge==5
drop _merge

** supplement_xian
sort unit_code_08 year
merge unit_code_08 year using temp_qz1.dta
tab year _merge
rename _merge mrg_qz1

** county_23_6.csv extra data on some county cities
sort unit_code_08 year
merge unit_code_08 year using temp_qz2.dta, update
tab year _merge
rename _merge mrg_qz2

**** Consolidate these gdp variables
replace gdp_py = gdp_qz if gdp_py==. | gdp_py==-9
replace gdp_sect2 = gdp_s2_qz if gdp_sect2==. | gdp_sect2==-9
replace gdp_sect3 = gdp_s3_qz if gdp_sect3==. | gdp_sect3==-9
drop gdp_qz gdp_s1_qz gdp_s2_qz gdp_s3_qz

**** Merge on CP and PF level asset data 
sort city05 year
merge city05 year using assets00.dta
tab year _merge 
drop _merge
egen sinv_cap2000 = sum(inv_cap2000), by(city05 year)
replace asset_n_qz = (net_asset_pref-net_asset_cp)*(inv_cap2000/sinv_cap2000) if asset_n_qz==.
drop net_asset_cp net_asset_pref sinv_cap2000

**** Merge on asset data that was recorded for yet to be promoted CPs
sort city05 year
merge city05 year using ../../data/tabular_data_BJ/generated/asset_data.dta
tab _merge
gen mrgobs = cp90==1 & (unit_status==1 | unit_status==3)
egen Mrgobs = max(mrgobs), by(city05 year)
tab Mrgobs if _merge==3
replace asset_n_qz = asset_n_qzx if cp90==1 & (unit_status==1|unit_status==3)
replace asset_g_qz = asset_g_qzx if cp90==1 & (unit_status==1|unit_status==3)
replace asset_n_qz = asset_n_qzx if cp90==1 & unit_name=="Qujing City"
replace asset_g_qz = asset_g_qzx if cp90==1 & unit_name=="Qujing City"
drop asset_n_qzx asset_g_qzx mrgobs Mrgobs _merge

*** Recode error values to missing
#delimit ;
mvdecode totemp-fixinvest agr_mp ruralpop rurallabor agr_rl cularea fertilizer_1990 rurele 
gdp_fenxian gdp_sector1_fenxian gdp_sector2_fenxian pop_f95 rural_pop_f95 fdi_py
ruralpop_fenxian rurallabor_fenxian agr_mp_fenxian, mv(-9);
#delimit cr
drop unit province county_name

*** Create sector 3 fenxian variable
gen gdp_sector3_fenxian = gdp_fenxian-gdp_sector1_fenxian-gdp_sector2_fenxian
replace gdp_sector3_fenxian = 0 if gdp_sector3_fenxian<0

*** Merge on provincial level GDP data
gen province_code90 = province_code
replace province_code90 = 510000 if province_code90==500000
sort province_code90 year
merge province_code90 year using temp_p90.dta
tab _merge if year==1990
drop _merge

** Drop variables with no obs
drop mrg1 obs r08_nnp aproad bustrly numhins

*** Populate city_name variable over all urban observations
sort city_code city_name
by city_code: replace city_name = city_name[1]
replace city_name = "Hengshui" if city_code==131100
replace city_name = "Rizhao" if city_code==371100

*** Label variables
label variable gdp "(ind yrbk)"
label variable nnp "(ind yrbk)"
label variable gdp_sector1 "(ind yrbk)"
label variable nnp_sector1 "(ind yrbk)"
label variable gdp_sector2 "(ind yrbk)"
label variable nnp_sector2 "(ind yrbk)"
label variable gdp_sector3 "(ind yrbk)"
label variable nnp_sector3 "(ind yrbk)"
label variable tot_pop "(ind yrbk)"
label variable r08_gdp "(ind yrbk)"
label variable r08_gdp_sector2 "(ind yrbk)"
label variable r08_nnp_sector2 "(ind yrbk)"
label variable r08_gdp_sector3 "(ind yrbk)"
label variable r08_nnp_sector3 "(ind yrbk)"
label variable r08_tot_pop "(ind yrbk)"
label variable arrived_fdi "(ind yrbk)"
label variable tot_emp "(ind yrbk)"
label variable tot_emp_sect2 "(ind yrbk)"
label variable tot_emp_sect3 "(ind yrbk)"
label variable num_car "(ind yrbk)"
label variable dust_ems "(ind yrbk)"
label variable so2 "(ind yrbk)"
label variable giov "(ind yrbk)"
label variable giov_soecol "(ind yrbk)"
label variable tot_area "(ind yrbk)"
label variable giov_new "(ind yrbk)"
label variable giov_soecol_new "(ind yrbk)"
label variable agr_mp "Total Agricultural Machinery Power (10000 Kw) (Michigan)"
label variable pop "Population at Year-end (10000 persons) (Michigan)"
label variable ruralpop "Rural Population at yr end (10000 persons) (Michigan)"
label variable rurallabor "Number of rural laborers (10000 persons) (Michigan)"
label variable gdp_michigan "gdp (100 millian yuan) (Michigan)"
label variable va_sec1 "Value Added of Primary Industry (100 million yuan) (Michigan)"
label variable va_sec2 "Value Added of Secondary Industry (100 million yuan) (Michigan)"
label variable expval "Value of Exports (1,000 US Dollars) (Michigan)"
label variable inv_cap2000 "Capital investment, 2000"
label variable inv_cap2001 "Capital investment, 2001"
label variable agr_rl "(fenxian)"
label variable cularea "(fenxian)"
label variable fertilizer_1990 "(fenxian)"
label variable rurele "(fenxian)"
label variable gdp_fenxian "(fenxian)"
label variable gdp_sector1_fenxian "(fenxian)"
label variable gdp_sector2_fenxian "(fenxian)"
label variable gdp_f95 "(95 fiscal yearbook)"
label variable gdp_sector2_f95 "(95 fiscal yearbook)"
label variable pop_f95 "(95 fiscal yearbook)"
label variable rural_pop_f95 "(95 fiscal yearbook)"
label variable totemp " (py)"
label variable gdp_py " (py)"
label variable gdp_sect2 " (py)"
label variable gdp_sect3 " (py)"
label variable total_pop " (py)"
label variable kmpr " (py)"
label variable apr " (py)"
label variable prpc " (py)"
label variable num_bt " (py)"
label variable avgsalary " (py)"
label variable num_colstd " (py)"
label variable givo " (py)"
label variable givo_soe " (py)"
label variable givo_colt " (py)"
label variable exp_lgt " (py)"
label variable num_hedu " (py)"
label variable publibcol " (py)"
label variable culland " (py)"
label variable area " (py)"
label variable emp_sect2 " (py)"
label variable emp_sect3 " (py)"
label variable fixinvest " (py)"
label variable fdi_py " (py)"
label variable givo_py " (py)"
label variable asset_g_qz " gross value of assets (py) "
label variable asset_n_qz " net value of assets (py) "
label variable pgdp "Provincial GDP"
label variable pgdp_sec1 "Provincial GDP, Sector 1"
label variable pgdp_sec2 "Provincial GDP, Sector 2"
label variable pgdp_sec3 "Provincial GDP, Sector 3"

*** Special districts typically only record GDP and not population -> set pop to 0
replace pop = 0 if unit_status==0 | unit_status==-1
replace pop_f95 = 0 if unit_status==0 | unit_status==-1
replace tot_pop = 0 if unit_status==0 | unit_status==-1

*** This is the full disaggregate data set
sort unit_code_08 year
save ..\..\data\tabular_data_BJ\generated\us123.dta, replace

log close

erase temp_fenxian.dta
erase temp_mi.dta
erase temp_indyb.dta
erase temp_vc97.dta
erase temp_f95.dta
erase temp_fenxian2.dta
erase tempcorr.dta

erase temp_mi3ycp.dta
erase temp_2ycp.dta
erase temp_qz1.dta
erase temp_qz2.dta
erase temp_p90.dta
erase assets00.dta













